F1 Project Data Modeling

Author

Cole Wagner

Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.1
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
Code
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
Code
library(rstanarm)
Loading required package: Rcpp

Attaching package: 'Rcpp'

The following object is masked from 'package:rsample':

    populate

This is rstanarm version 2.32.1
- See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
- Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
- For execution on a local, multicore CPU with excess RAM we recommend calling
  options(mc.cores = parallel::detectCores())
Code
library(multilevelmod)
library(loo)
This is loo version 2.8.0
- Online documentation and vignettes at mc-stan.org/loo
- As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session. 
Code
library(zoo)

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
Code
library(bayesplot)
This is bayesplot version 1.11.1
- Online documentation and vignettes at mc-stan.org/bayesplot
- bayesplot theme set to bayesplot::theme_default()
   * Does _not_ affect other ggplot2 plots
   * See ?bayesplot_theme_set for details on theme setting
Code
library(rstan)
Loading required package: StanHeaders

rstan version 2.32.6 (Stan version 2.32.2)

For execution on a local, multicore CPU with excess RAM we recommend calling
options(mc.cores = parallel::detectCores()).
To avoid recompilation of unchanged Stan programs, we recommend calling
rstan_options(auto_write = TRUE)
For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
change `threads_per_chain` option:
rstan_options(threads_per_chain = 1)


Attaching package: 'rstan'

The following object is masked from 'package:tidyr':

    extract
Code
library(shinystan)
Loading required package: shiny

Attaching package: 'shiny'

The following object is masked from 'package:infer':

    observe


This is shinystan version 2.6.0
Code
library(broom.mixed)
library(ranger)
library(finetune)
library(xgboost)

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
Code
library(kableExtra)

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows
Code
library(iml)
# Fix the RNG seed so resampling splits and stochastic fits are reproducible
set.seed(1)

# Lap-level F1 dataset; missing values were removed in an upstream step
full_no_missing <- read_csv("../Data/full_no_missing.csv")
Rows: 4398 Columns: 34
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): FULL_NAME, LOCATION, COUNTRY_NAME, CIRCUIT_SHORT_NAME, SESSION_TY...
dbl  (21): MEETING_KEY, SESSION_KEY, DRIVER_NUMBER, I1_SPEED, I2_SPEED, ST_S...
dttm  (3): DATE_START_LAP, DATE_START_SESSION, DATE_END_SESSION

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Re-guess column types after the CSV import, then treat the ID-like
# keys and the binary rainfall flag as factors rather than numbers.
full_clean_types <- full_no_missing %>%
  type_convert() %>%
  mutate(across(c(MEETING_KEY, SESSION_KEY, DRIVER_NUMBER, RAINFALL), as.factor))

── Column specification ────────────────────────────────────────────────────────
cols(
  FULL_NAME = col_character(),
  LOCATION = col_character(),
  COUNTRY_NAME = col_character(),
  CIRCUIT_SHORT_NAME = col_character(),
  SESSION_TYPE = col_character(),
  SESSION_NAME = col_character(),
  GMT_OFFSET = col_character(),
  CIRCUIT = col_character(),
  TYPE = col_character(),
  DIRECTION = col_character()
)

Apply the 107% rule to filter out “non-hot” laps

Code
# Create a column to show if there was rain at any point in the session
# Also, find the cutoff time for each session
# Per-session summary: count laps run in the rain and compute the 107%
# cutoff (1.07 x the session's fastest lap) used to drop "non-hot" laps.
rain_cutoff <- full_clean_types %>%
  group_by(SESSION_KEY) %>%
  summarise(
    # RAINFALL is a factor; `== 1` compares against the label "1"
    rain_laps = sum(RAINFALL == 1),
    # 107% rule threshold for this session
    cutoff = min(LAP_DURATION) * 1.07
  ) %>%
  ungroup()

# Attach each session's cutoff and keep only laps under it.
# The join key is stated explicitly (was previously an implicit natural
# join that printed `Joining with by = join_by(SESSION_KEY)`).
full_eng <- left_join(full_clean_types, rain_cutoff, by = "SESSION_KEY") %>%
  filter(LAP_DURATION < cutoff)
Joining with `by = join_by(SESSION_KEY)`

Get Final Modeling Data

Select only the variables that will be used in the model.

Code
modeling_data <- full_eng %>%
  # Keep only the variables needed for modeling.
  # NOTE(review): positional selection is fragile -- if the upstream column
  # order ever changes this silently picks the wrong variables. Prefer
  # selecting by name; confirm which columns 8, 13:17, 19, 23, 33:35 are.
  select(c(8, 13:17, 19, 23, 33:35))

Base Model Pipeline

Code
# Preprocessing: center/scale numeric predictors, dummy-encode nominals.
base_rec <- recipe(LAP_DURATION ~ ., data = modeling_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = FALSE)

# Processed training set. juice() is superseded in recipes;
# bake(prep(rec), new_data = NULL) returns the identical result.
base_juice <- bake(prep(base_rec), new_data = NULL)

# Repeated 10-fold CV, stratified on the outcome
kfold_rep <- vfold_cv(modeling_data, v = 10, repeats = 2, strata = "LAP_DURATION")

# OLS linear regression specification
f1_lm <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Workflow bundling the recipe with the model
lm_workflow <- workflow() %>%
  add_model(f1_lm) %>%
  add_recipe(base_rec)

# Resampled performance estimates across all folds/repeats
lm_res <- fit_resamples(
  lm_workflow,
  resamples = kfold_rep,
  metrics = metric_set(rmse, mae, rsq)
)
→ A | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
There were issues with some computations   A: x1
There were issues with some computations   A: x20
Code
lm_res %>% collect_metrics()
# A tibble: 3 × 6
  .metric .estimator  mean     n  std_err .config             
  <chr>   <chr>      <dbl> <int>    <dbl> <chr>               
1 mae     standard   0.939    20 0.00927  Preprocessor1_Model1
2 rmse    standard   1.34     20 0.0156   Preprocessor1_Model1
3 rsq     standard   0.987    20 0.000372 Preprocessor1_Model1
Code
# Fit on the full data and pull tidied coefficient estimates
lm_coefs <- tidy(extract_fit_parsnip(fit(lm_workflow, modeling_data)))

Check assumptions

Code
# Underlying lm object, for the base-R diagnostic plots
lm_native <- lm_workflow %>%
  fit(modeling_data) %>%
  extract_fit_engine()

plot(lm_native)

Code
# Normality is not great, but a log transformation makes it worse.
# Distribution of the raw outcome (lap duration; units presumably seconds -- confirm)
hist(modeling_data$LAP_DURATION)

Code
hist(log(modeling_data$LAP_DURATION))

Code
# Scatter each predictor (column 8 excluded) against the outcome to
# eyeball linearity, labeling each plot with the predictor's name.
predictors <- modeling_data[, -8]
for (i in seq_along(predictors)) {
  plot(
    x = predictors[[i]], y = modeling_data$LAP_DURATION,
    xlab = colnames(predictors)[i]
  )
}

After looking at these plots, it appears that OLS linear regression is not the most appropriate model type for this data. Most of the predictors do not appear to be linearly related to the outcome, and the assumptions — especially the normality assumption — are shaky. As a result, I want to try two other frameworks: a Bayesian linear regression with random intercepts, and tree-based methods (random forest and XGBoost). I do not want to throw out the results from the OLS regression, though, because the model is overall very good and has consistent performance across folds. As a result, I don't see these results as invalid.

Bayesian Linear Regression with Random Intercepts

Based on the fact that each track is so unique, my background knowledge leads me to believe that a random intercept for each track could improve model performance.

Code
# Recipe flagging CIRCUIT_SHORT_NAME as the grouping ("experimental") unit
# so it is excluded from normalization/dummy coding and can serve as the
# random-intercept grouping factor in the mixed model.
randint_rec <- recipe(LAP_DURATION ~ ., data = modeling_data) %>%
  # This adds random intercept for circuit
  add_role(CIRCUIT_SHORT_NAME, new_role = "exp_unit") %>%
  step_normalize(all_numeric_predictors(), -has_role("exp_unit")) %>%
  step_dummy(all_nominal_predictors(), -has_role("exp_unit"))

# Set up the Bayesian engine and priors.
# NOTE(review): no explicit priors are supplied, so rstanarm's defaults
# apply; the package startup message warns these may change by version.
glmer_mod <- linear_reg() %>%
  set_engine("stan_glmer",
    cores = 4
  )

randint_wf <- workflow() %>%
  add_recipe(randint_rec) %>%
  # Need to specify formula for the model.
  # NOTE(review): RAINFALL_X1 is the dummy column step_dummy() generates
  # from the RAINFALL factor -- this formula silently breaks if that
  # generated name ever changes.
  add_model(glmer_mod, formula = LAP_DURATION ~ AIR_TEMPERATURE + HUMIDITY +
    PRESSURE + RAINFALL_X1 + TRACK_TEMPERATURE + WIND_SPEED + TURNS +
    LENGTH_KM + rain_laps + (1 | CIRCUIT_SHORT_NAME))

# Resampled error rate (expensive MCMC; result is cached to disk --
# uncomment the block below to regenerate)
# randint_res <- fit_resamples(
#   randint_wf,
#   resamples = kfold_rep,
#   metrics = metric_set(rmse, mae, rsq)
# )
# saveRDS(randint_res, "../Data/randint_res.rds")
randint_res <- readRDS("../Data/randint_res.rds")
# Look at performance metrics
randint_res %>% collect_metrics()
# A tibble: 3 × 6
  .metric .estimator  mean     n  std_err .config             
  <chr>   <chr>      <dbl> <int>    <dbl> <chr>               
1 mae     standard   0.936    20 0.00925  Preprocessor1_Model1
2 rmse    standard   1.34     20 0.0157   Preprocessor1_Model1
3 rsq     standard   0.987    20 0.000375 Preprocessor1_Model1
Code
# Extract native model fit (full-data stanreg object; cached to disk --
# regenerate via the commented lines below)
# randint_extract <- extract_fit_engine(randint_wf %>% fit(modeling_data))
# saveRDS(randint_extract, "../Data/randint_extract.rds")
randint_extract <- readRDS("../Data/randint_extract.rds")

# Interactive MCMC diagnostics (opens a Shiny app; left commented for rendering)
# launch_shinystan(randint_extract)

After examining diagnostics via shinystan, the assumptions seem to be met.

Tree-Based Methods

Random Forest

Code
# Create a random forest model with all three main hyperparameters tuned
rf_mod <- rand_forest(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger", num.threads = 7)

rf_wf <- workflow() %>%
  add_recipe(base_rec) %>%
  add_model(rf_mod)

# Create a grid of tuning parameters.
# NOTE(review): the mtry upper bound of 32 is assumed to match the
# post-dummy predictor count -- confirm against the baked design matrix.
rf_param <- extract_parameter_set_dials(rf_mod) %>%
  update(mtry = mtry(c(1, 32)))

rf_tunegrid <- grid_space_filling(rf_param, size = 5)

# Tune the model via ANOVA racing (expensive; result cached to disk --
# uncomment to regenerate)
# rf_tune <- tune_race_anova(
#   rf_wf,
#   resamples = kfold_rep,
#   grid = rf_tunegrid,
#   metrics = metric_set(rmse, mae, rsq)
# )
#
# saveRDS(rf_tune, "../Data/rf_tune.rds")
rf_tune <- readRDS("../Data/rf_tune.rds")

rf_besttune <- select_best(rf_tune, metric = "rmse")

rf_finalmod <- finalize_model(rf_mod, rf_besttune)

rf_finalwf <- workflow() %>%
  add_recipe(base_rec) %>%
  add_model(rf_finalmod)

# Best candidate under each metric; ranking by mae/rsq after an
# rmse-based race intentionally triggers warnings
rbind(rf_tune %>% show_best(metric = "rmse"),
rf_tune %>% show_best(metric = "mae"),
rf_tune %>% show_best(metric = "rsq"))
Warning: Metric "rmse" was used to evaluate model candidates in the race but "mae" has
been chosen to rank the candidates. These results may not agree with the race.
Warning: Metric "rmse" was used to evaluate model candidates in the race but "rsq" has
been chosen to rank the candidates. These results may not agree with the race.
# A tibble: 3 × 9
   mtry trees min_n .metric .estimator  mean     n   std_err .config            
  <int> <int> <int> <chr>   <chr>      <dbl> <int>     <dbl> <chr>              
1    24  2000    21 rmse    standard   0.522    20 0.00796   Preprocessor1_Mode…
2    24  2000    21 mae     standard   0.378    20 0.00363   Preprocessor1_Mode…
3    24  2000    21 rsq     standard   0.998    20 0.0000649 Preprocessor1_Mode…

XGBoost

Code
# Gradient-boosted trees with every major hyperparameter tuned
xgb_mod <- boost_tree(
  mode = "regression",
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  stop_iter = tune()
) %>%
  # xgboost's threading argument is `nthread`; the previous `num.threads`
  # is a ranger argument and was silently ignored by xgboost
  set_engine("xgboost", nthread = 7)

# Tuning parameter set; mtry upper bound matched to the RF grid (32)
xgb_param <- extract_parameter_set_dials(xgb_mod) %>%
  update(mtry = mtry(c(1, 32)))

xgb_tunegrid <- grid_space_filling(xgb_param, size = 5)
Warning: Due to the small size of the grid, a Latin hypercube design will be
used.
Code
xgb_wf <- workflow() %>%
  add_recipe(base_rec) %>%
  add_model(xgb_mod)


# Racing tune (expensive; result cached to disk -- uncomment to regenerate)
# xgb_tune <- tune_race_anova(
#   xgb_wf,
#   resamples = kfold_rep,
#   grid = xgb_tunegrid,
#   metrics = metric_set(rmse, mae, rsq)
# )
# 
# saveRDS(xgb_tune, "../Data/xgb_tune.rds")
xgb_tune <- readRDS("../Data/xgb_tune.rds")

xgb_besttune <- select_best(xgb_tune, metric = "rmse")

xgb_finalmod <- finalize_model(xgb_mod, xgb_besttune)

xgb_finalwf <- workflow() %>%
  add_recipe(base_rec) %>%
  add_model(xgb_finalmod)

# Best candidate under each metric; ranking by mae/rsq after an
# rmse-based race intentionally triggers warnings
rbind(xgb_tune %>% show_best(metric = "rmse"),
xgb_tune %>% show_best(metric = "mae"),
xgb_tune %>% show_best(metric = "rsq"))
Warning: Metric "rmse" was used to evaluate model candidates in the race but "mae" has
been chosen to rank the candidates. These results may not agree with the race.
Warning: Metric "rmse" was used to evaluate model candidates in the race but "rsq" has
been chosen to rank the candidates. These results may not agree with the race.
# A tibble: 3 × 14
   mtry trees min_n tree_depth learn_rate loss_reduction sample_size stop_iter
  <int> <int> <int>      <int>      <dbl>          <dbl>       <dbl>     <int>
1    25   982    27          2      0.268      0.0000528       0.680         8
2    25   982    27          2      0.268      0.0000528       0.680         8
3    25   982    27          2      0.268      0.0000528       0.680         8
# ℹ 6 more variables: .metric <chr>, .estimator <chr>, mean <dbl>, n <int>,
#   std_err <dbl>, .config <chr>

Model Evaluation

Code
# Format all metrics tables.
# Helper: reshape a collect_metrics()/show_best() table into one wide row
# of metric means and one of standard errors, labeled with the model name.
# (This pipeline was previously copy-pasted once per model.)
format_metrics <- function(metrics_tbl, label) {
  metrics_tbl %>%
    select(.metric, mean, std_err) %>%
    pivot_longer(-1) %>%
    pivot_wider(names_from = 1, values_from = value) %>%
    mutate(Model = paste0(label, c(" (mean)", " (std)")), .before = name) %>%
    select(-name)
}

lm_metrics <- lm_res %>%
  collect_metrics() %>%
  format_metrics("Base Linear Regression")

randint_metrics <- randint_res %>%
  collect_metrics() %>%
  format_metrics("Bayesian LR w/ Random Intercept")

# show_best() warns because the race was run on rmse -- expected
rf_metrics <- rbind(rf_tune %>% show_best(metric = "rmse"),
rf_tune %>% show_best(metric = "mae"),
rf_tune %>% show_best(metric = "rsq")) %>%
  format_metrics("Random Forest")
Warning: Metric "rmse" was used to evaluate model candidates in the race but "mae" has
been chosen to rank the candidates. These results may not agree with the race.
Warning: Metric "rmse" was used to evaluate model candidates in the race but "rsq" has
been chosen to rank the candidates. These results may not agree with the race.
Code
# Reshape XGBoost best-candidate metrics into one row of means and one of
# standard errors (same wide layout as the other models' metric tables)
xgb_metrics <- rbind(xgb_tune %>% show_best(metric = "rmse"),
xgb_tune %>% show_best(metric = "mae"),
xgb_tune %>% show_best(metric = "rsq")) %>%
  select(.metric, mean, std_err) %>%
  pivot_longer(-1) %>%
  pivot_wider(names_from = 1, values_from = value) %>%
  mutate(Model = c("XGBoost (mean)", "XGBoost (std)"), .before = name) %>%
  select(-name)
Warning: Metric "rmse" was used to evaluate model candidates in the race but "mae" has
been chosen to rank the candidates. These results may not agree with the race.
Metric "rmse" was used to evaluate model candidates in the race but "rsq" has
been chosen to rank the candidates. These results may not agree with the race.
Code
# Combine all per-model metric rows and render a styled table
comb_metrics <- rbind(lm_metrics, randint_metrics, rf_metrics, xgb_metrics) %>%
  rename(RMSE = rmse, MAE = mae, Rsquared = rsq)

metrics_table <- kable(comb_metrics, digits = 3) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
metrics_table
Model MAE RMSE Rsquared
Base Linear Regression (mean) 0.939 1.337 0.987
Base Linear Regression (std) 0.009 0.016 0.000
Bayesian LR w/ Random Intercept (mean) 0.936 1.337 0.987
Bayesian LR w/ Random Intercept (std) 0.009 0.016 0.000
Random Forest (mean) 0.378 0.522 0.998
Random Forest (std) 0.004 0.008 0.000
XGBoost (mean) 0.402 0.547 0.998
XGBoost (std) 0.004 0.008 0.000

Because the random forest model is best, I will further examine this model.

Random Forest Examination

Variable Importance Plot

Code
# Native random forest model (ranger object from the finalized workflow)
rf_extract <- extract_fit_engine(rf_finalwf %>% fit(modeling_data))

# Create prediction function: wraps ranger's predict() so iml receives a
# plain numeric vector of predictions
pfun_ranger <- function(object, newdata) predict(object, data = newdata)$predictions

# Create a base iml object that is needed before generating additional plots.
# NOTE(review): column 9 of base_juice is assumed to be LAP_DURATION --
# confirm; positional indexing breaks silently if the recipe output changes.
predictor_ranger <- Predictor$new(model = rf_extract, 
                                  data = base_juice[,-c(9)], 
                                  y = base_juice[,9], predict.fun = pfun_ranger)

# Permutation feature importance (expensive; cached to disk -- uncomment
# the lines below to regenerate)
# imp_rf <- FeatureImp$new(predictor_ranger, loss = "rmse")
# saveRDS(imp_rf, "../Data/imp_rf.rds")
imp_rf <- readRDS("../Data/imp_rf.rds")
plot(imp_rf)

ALE Plots

Code
# Accumulated local effects of track length on predicted lap time
rf_ale_length <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "LENGTH_KM")
rf_ale_length$plot()

Code
# Accumulated local effects of turn count on predicted lap time
rf_ale_turns <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "TURNS")
rf_ale_turns$plot()

Code
# Accumulated local effects of humidity on predicted lap time
rf_ale_humidity <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "HUMIDITY")
rf_ale_humidity$plot()

Code
# Accumulated local effects of air pressure on predicted lap time
rf_ale_pressure <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "PRESSURE")
rf_ale_pressure$plot()

Code
# Accumulated local effects of air temperature on predicted lap time
rf_ale_airtemp <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "AIR_TEMPERATURE")
rf_ale_airtemp$plot()

Code
# Accumulated local effects of track temperature on predicted lap time
rf_ale_tracktemp <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "TRACK_TEMPERATURE")
rf_ale_tracktemp$plot()

Run Model With Only Weather Predictors

Code
# Remove track attributes from the data.
# NOTE(review): positional drop -- confirm columns 8:10 are the track columns.
weatheronly_data <- modeling_data[, -c(8:10)]

# Same preprocessing as the base recipe, on weather predictors only
weatheronly_rec <- recipe(LAP_DURATION ~ ., data = weatheronly_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = FALSE)

# Processed training set (juice() is superseded; bake(new_data = NULL)
# returns the same result)
weatheronly_juice <- bake(prep(weatheronly_rec), new_data = NULL)

# Create a random forest model with all three main hyperparameters tuned
weatheronly_mod <- rand_forest(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger", num.threads = 7)

weatheronly_wf <- workflow() %>%
  add_recipe(weatheronly_rec) %>%
  add_model(weatheronly_mod)

# Tuning grid; mtry capped at the 7 remaining predictors
weatheronly_param <- extract_parameter_set_dials(weatheronly_mod) %>%
  update(mtry = mtry(c(1, 7)))

weatheronly_tunegrid <- grid_space_filling(weatheronly_param, size = 5)

# Fresh CV folds on the reduced data, stratified on the outcome
weatheronly_rep <- vfold_cv(weatheronly_data, v = 10, repeats = 2, strata = "LAP_DURATION")

# Racing tune (expensive; result cached to disk -- uncomment to regenerate)
# weatheronly_tune <- tune_race_anova(
#   weatheronly_wf,
#   resamples = weatheronly_rep,
#   grid = weatheronly_tunegrid,
#   metrics = metric_set(yardstick::rmse, yardstick::mae, yardstick::rsq)
# )

# saveRDS(weatheronly_tune, "../Data/weatheronly_tune.rds")
weatheronly_tune <- readRDS("../Data/weatheronly_tune.rds")

weatheronly_besttune <- select_best(weatheronly_tune, metric = "rmse")

weatheronly_finalmod <- finalize_model(weatheronly_mod, weatheronly_besttune)

weatheronly_finalwf <- workflow() %>%
  add_recipe(weatheronly_rec) %>%
  add_model(weatheronly_finalmod)

# Best candidate under each metric (racing-metric warnings expected)
rbind(weatheronly_tune %>% show_best(metric = "rmse"),
weatheronly_tune %>% show_best(metric = "mae"),
weatheronly_tune %>% show_best(metric = "rsq"))
Warning: Metric "rmse" was used to evaluate model candidates in the race but "mae" has
been chosen to rank the candidates. These results may not agree with the race.
Warning: Metric "rmse" was used to evaluate model candidates in the race but "rsq" has
been chosen to rank the candidates. These results may not agree with the race.
# A tibble: 3 × 9
   mtry trees min_n .metric .estimator  mean     n  std_err .config             
  <int> <int> <int> <chr>   <chr>      <dbl> <int>    <dbl> <chr>               
1     5  2000    21 rmse    standard   0.567    20 0.0137   Preprocessor1_Model4
2     5  2000    21 mae     standard   0.392    20 0.00507  Preprocessor1_Model4
3     5  2000    21 rsq     standard   0.998    20 0.000116 Preprocessor1_Model4

Variable Importance

Code
# Native random forest model fit on the weather-only data
weatheronly_extract <- extract_fit_engine(weatheronly_finalwf %>% fit(weatheronly_data))

# Create prediction function: wraps ranger's predict() so iml receives a
# plain numeric vector of predictions
pfun_ranger <- function(object, newdata) predict(object, data = newdata)$predictions

# Create a base iml object that is needed before generating additional plots.
# NOTE(review): column 7 of weatheronly_juice is assumed to be LAP_DURATION --
# confirm; positional indexing breaks silently if the recipe output changes.
predictor_ranger <- Predictor$new(model = weatheronly_extract, 
                                  data = weatheronly_juice[, -7], 
                                  y = weatheronly_juice[, 7], predict.fun = pfun_ranger)

# Permutation feature importance (expensive; cached to disk -- uncomment
# the lines below to regenerate)
# imp_weatheronly <- FeatureImp$new(predictor_ranger, loss = "rmse")
# saveRDS(imp_weatheronly, "../Data/imp_weatheronly.rds")
imp_weatheronly <- readRDS("../Data/imp_weatheronly.rds")
plot(imp_weatheronly)

ALE Plots

Code
# ALE of humidity under the weather-only model
weatheronly_ale_humidity <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "HUMIDITY")
weatheronly_ale_humidity$plot()

Code
# ALE of pressure under the weather-only model
weatheronly_ale_pressure <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "PRESSURE")
weatheronly_ale_pressure$plot()

Code
# ALE of air temperature under the weather-only model
weatheronly_ale_airtemp <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "AIR_TEMPERATURE")
weatheronly_ale_airtemp$plot()

Code
# ALE of track temperature under the weather-only model
weatheronly_ale_tracktemp <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "TRACK_TEMPERATURE")
weatheronly_ale_tracktemp$plot()

Remove Mexico City Laps Because of Extreme Low Pressure

Code
# Remove Mexico City laps (extreme low pressure outlier) and the track
# attribute columns.
# NOTE(review): positional drop -- confirm columns 8:10 are the track columns.
nomex_data <- modeling_data[modeling_data$CIRCUIT_SHORT_NAME != "Mexico City", -c(8:10)]

# Same preprocessing as the weather-only recipe
nomex_rec <- recipe(LAP_DURATION ~ ., data = nomex_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = FALSE)

# Processed training set (juice() is superseded; bake(new_data = NULL)
# returns the same result)
nomex_juice <- bake(prep(nomex_rec), new_data = NULL)

# Create a random forest model with all three main hyperparameters tuned
nomex_mod <- rand_forest(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger", num.threads = 7)

nomex_wf <- workflow() %>%
  add_recipe(nomex_rec) %>%
  add_model(nomex_mod)

# Tuning grid; mtry capped at the 7 remaining predictors
nomex_param <- extract_parameter_set_dials(nomex_mod) %>%
  update(mtry = mtry(c(1, 7)))

nomex_tunegrid <- grid_space_filling(nomex_param, size = 5)

# Fresh CV folds on the reduced data, stratified on the outcome
nomex_rep <- vfold_cv(nomex_data, v = 10, repeats = 2, strata = "LAP_DURATION")

# Racing tune (expensive; result cached to disk -- uncomment to regenerate)
# nomex_tune <- tune_race_anova(
#   nomex_wf,
#   resamples = nomex_rep,
#   grid = nomex_tunegrid,
#   metrics = metric_set(yardstick::rmse, yardstick::mae, yardstick::rsq)
# )

# saveRDS(nomex_tune, "../Data/nomex_tune.rds")
nomex_tune <- readRDS("../Data/nomex_tune.rds")

nomex_besttune <- select_best(nomex_tune, metric = "rmse")

nomex_finalmod <- finalize_model(nomex_mod, nomex_besttune)

nomex_finalwf <- workflow() %>%
  add_recipe(nomex_rec) %>%
  add_model(nomex_finalmod)

# Best candidate under each metric (racing-metric warnings expected)
rbind(nomex_tune %>% show_best(metric = "rmse"),
nomex_tune %>% show_best(metric = "mae"),
nomex_tune %>% show_best(metric = "rsq"))
Warning: Metric "rmse" was used to evaluate model candidates in the race but "mae" has
been chosen to rank the candidates. These results may not agree with the race.
Warning: Metric "rmse" was used to evaluate model candidates in the race but "rsq" has
been chosen to rank the candidates. These results may not agree with the race.
# A tibble: 3 × 9
   mtry trees min_n .metric .estimator  mean     n  std_err .config             
  <int> <int> <int> <chr>   <chr>      <dbl> <int>    <dbl> <chr>               
1     5  2000    21 rmse    standard   0.600    20 0.0311   Preprocessor1_Model4
2     5  2000    21 mae     standard   0.394    20 0.00662  Preprocessor1_Model4
3     5  2000    21 rsq     standard   0.997    20 0.000320 Preprocessor1_Model4

Variable Importance

Code
# Native random forest model fit on the no-Mexico-City data
nomex_extract <- extract_fit_engine(nomex_finalwf %>% fit(nomex_data))

# Create prediction function: wraps ranger's predict() so iml receives a
# plain numeric vector of predictions
pfun_ranger <- function(object, newdata) predict(object, data = newdata)$predictions

# Create a base iml object that is needed before generating additional plots.
# NOTE(review): column 7 of nomex_juice is assumed to be LAP_DURATION --
# confirm; positional indexing breaks silently if the recipe output changes.
predictor_ranger <- Predictor$new(model = nomex_extract, 
                                  data = nomex_juice[, -7], 
                                  y = nomex_juice[, 7], predict.fun = pfun_ranger)

# Permutation feature importance (expensive; cached to disk -- uncomment
# the lines below to regenerate)
# imp_nomex <- FeatureImp$new(predictor_ranger, loss = "rmse")
# saveRDS(imp_nomex, "../Data/imp_nomex.rds")
imp_nomex <- readRDS("../Data/imp_nomex.rds")
plot(imp_nomex)

ALE Plot for Pressure

Code
# ALE of pressure after excluding the Mexico City outlier laps
nomex_ale_pressure <- FeatureEffect$new(predictor_ranger, method = "ale", feature = "PRESSURE")
nomex_ale_pressure$plot()